Loading the dataset
library(httr)
##
## Attaching package: 'httr'
## The following object is masked from 'package:plotly':
##
## config
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
# Download every row behind a Socrata-style JSON endpoint, one page at a time.
#
# Arguments:
#   url        - address of the JSON resource to query.
#   chunk_size - number of rows requested per page (default 50000, the page
#                size that was previously hard-coded).
#
# Returns a list of tibbles, one per page requested, in request order.
# Paging stops after the first page holding fewer than `chunk_size` rows, so
# the last element may be short or empty.
#
# Stops with an error if any request comes back with an HTTP error status.
get_all_inspections = function(url, chunk_size = 50000) {
  all_inspections = list()
  loop_index = 1
  DO_NEXT = TRUE

  while (DO_NEXT) {
    message("Getting data, page ", loop_index)

    response = GET(
      url,
      query = list(
        # zipcode alone is not unique, so rows with tied zipcodes could move
        # between pages; the `:id` tie-breaker makes the ordering total and
        # keeps $limit/$offset paging stable.
        `$order` = "zipcode, :id",
        # as.integer() keeps large values from being sent as e.g. "1e+05".
        `$limit` = as.integer(chunk_size),
        `$offset` = as.integer((loop_index - 1) * chunk_size)
      )
    )

    # Fail loudly instead of trying to parse an error payload as data.
    stop_for_status(response)

    all_inspections[[loop_index]] =
      response %>%
      content("text", encoding = "UTF-8") %>%
      fromJSON() %>%
      as_tibble()

    # A full page means there may be more rows; a short page is the last one.
    DO_NEXT = nrow(all_inspections[[loop_index]]) == chunk_size
    loop_index = loop_index + 1
  }

  all_inspections
}
# Endpoint queried for the inspection records.
url = "https://data.cityofnewyork.us/resource/43nn-pn8j.json"

# Fetch every page and stack the per-page tibbles into a single table.
nyc_inspections = bind_rows(get_all_inspections(url))
## Getting data, page 1
## Getting data, page 2
## Getting data, page 3
## Getting data, page 4
## Getting data, page 5
## Getting data, page 6
Data cleaning
# Convert the score to numeric and the three date columns to Date, keep only
# rows where score, boro and inspection_date are all present, and drop exact
# duplicate rows.
nyc_inspections_cleaned =
  nyc_inspections %>%
  mutate(
    score = as.numeric(score),
    across(c(inspection_date, grade_date, record_date), as.Date)
  ) %>%
  filter(
    !is.na(score),
    !is.na(boro),
    !is.na(inspection_date)
  ) %>%
  distinct()

# Print the cleaned table.
nyc_inspections_cleaned
## # A tibble: 243,200 × 26
## camis boro building street zipcode phone inspection_date critical_flag
## <chr> <chr> <chr> <chr> <chr> <chr> <date> <chr>
## 1 50147715 Manhatt… 2 W 69T… 10000 3474… 2024-04-23 Not Critical
## 2 50132187 Manhatt… NKA CENTR… 10000 6469… 2024-07-23 Critical
## 3 50132187 Manhatt… NKA CENTR… 10000 6469… 2023-02-27 Critical
## 4 50132187 Manhatt… NKA CENTR… 10000 6469… 2023-02-27 Not Critical
## 5 50132187 Manhatt… NKA CENTR… 10000 6469… 2024-07-23 Not Critical
## 6 50132187 Manhatt… NKA CENTR… 10000 6469… 2023-02-27 Not Critical
## 7 50147715 Manhatt… 2 W 69T… 10000 3474… 2024-09-12 Not Critical
## 8 50147715 Manhatt… 2 W 69T… 10000 3474… 2024-04-23 Critical
## 9 50132187 Manhatt… NKA CENTR… 10000 6469… 2023-02-27 Not Critical
## 10 50147715 Manhatt… 2 W 69T… 10000 3474… 2024-04-23 Critical
## # ℹ 243,190 more rows
## # ℹ 18 more variables: record_date <date>, dba <chr>,
## # cuisine_description <chr>, action <chr>, violation_code <chr>,
## # violation_description <chr>, score <dbl>, inspection_type <chr>,
## # latitude <chr>, longitude <chr>, community_board <chr>,
## # council_district <chr>, census_tract <chr>, bin <chr>, bbl <chr>,
## # nta <chr>, grade <chr>, grade_date <date>
# Interactive scatterplot of score against inspection date. Each marker is
# coloured by its score and carries a text label with the borough and score.
plot_ly(
  data = mutate(
    nyc_inspections_cleaned,
    text_label = str_c("Borough:", boro, "\nScore:", score)
  ),
  x = ~inspection_date,
  y = ~score,
  type = "scatter",
  mode = "markers",
  color = ~score,
  text = ~text_label,
  alpha = 0.5
) %>%
  layout(
    title = "Scatterplot of Scores Over Inspection Dates",
    xaxis = list(title = "Inspection Date"),
    yaxis = list(title = "Score")
  )